1 Load libraries and source code

rm(list=ls())

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
require(data.table)
library(dplyr)
library(tidyr)
require(parallel)
library(bdc)
library(taxadb)
library(traitdataform)
library(pbapply)
library(tidyverse)
library(readxl)
library(lme4)
## Warning: package 'lme4' was built under R version 4.3.2
library(coefplot)
library(sjPlot)
library(sjmisc)
library(effects)
library(rgdal)
library(maptools)
# library(rgeos)
library(terra)
## Warning: package 'terra' was built under R version 4.3.2
library(MuMIn)
# library(nlme)
# library(pscl)
library(lsmeans)
# library(arm)
# library(MCMCglmm)
# library(PerformanceAnalytics)
library(GGally)
# library(broom.mixed)
library(tidyterra)
## Warning: package 'tidyterra' was built under R version 4.3.2
# library(rnaturalearth)

2 Functions

3 source functions

# Code to find accepted species names
# Before you can download the source code from github, make sure you have a personal github token
# run this:
# usethis::edit_r_environ()
# if you have no toke, create one following:
# 1. Generate on GitHub your personal token
# 1.1 Go to GitHub
# 2.1 In the right corner go to "Settings"
# 2.2 Then in the left part go to "Developer setting"
# 2.3 Select the option "Personal access tokens"
# 2.4 Select the option "Generate new token"
# 2.5 Copy your personal token
# run this to add the token to your .Renviron
# usethis::edit_r_environ()
# write GITHUB_PAT=YOUR_TOKEN
# Sys.getenv("GITHUB_PAT")

# source(here::here("R/fetchGHdata.R"))
# 
# fetchGHdata(gh_account = "Bioshifts", 
#             repo = "bioshifts_v1_v2", 
#             path = "R/Source_code/Clean_names.R",
#             output = here::here("R/Clean_names.R"))
# 
# fetchGHdata(gh_account = "Bioshifts", 
#             repo = "bioshifts_v1_v2", 
#             path = "R/Source_code/Find_Sci_Names.R",
#             output = here::here("R/Find_Sci_Names.R"))

source(here::here("R/Clean_names.R"))
source(here::here("R/Find_Sci_Names.R"))
source(here::here("R/mclapply_hack.R"))
## 
##     *** Microsoft Windows detected ***
##     
##     For technical reasons, the MS Windows version of mclapply()
##     is implemented as a serial function instead of a parallel
##     function.    
## 
##     As a quick hack, we replace this serial version of mclapply()
##     with a wrapper to parLapply() for this R session. Please see
## 
##       http://www.stat.cmu.edu/~nmv/2014/07/14/implementing-mclapply-on-windows 
## 
##     for details.
# Malin's transformation for moving values slightly inward. 
transform01 <- function(x) (x * (length(x) - 1) + 0.5) / (length(x))

# De Kort transformation
deKort_trans <- function(p){
  p <- scale(p)
  p <- (p - min(p, na.rm = T))/(max(p, na.rm = T)-min(p, na.rm = T))
  return(p)
}

# Malin's transformation before a logit transformation
logit_trans <- function(p){
  p <- transform01(p)
  p <- log(p/(1-p))
  return(p)
}

4 Load species list

splist <- read.csv(here::here("Data/splist.csv"), header = T)
# remove duplicated sp_names
splist <- splist %>%
    dplyr::filter(!duplicated(scientificName))%>%
    mutate(Kingdom=kingdom,
           Phylum=phylum,
           Class=class,
           Order=order,
           Family=family)%>%
    dplyr::select(scientificName,Kingdom,Phylum,Class,Order,Family,db)

splist$Genus <- sapply(splist$scientificName, function(x){
    strsplit(x," ")[[1]][1]
})

5 Load bioshifts

5.1 Geo data

if(!file.exists(here::here("data/bioshiftsv3_metadata.csv"))){
  
  # # From Google drive
  # drive_path <- readWindowsShortcut("G:/My Drive.lnk")$pathname
  # bioshifts_path <- readWindowsShortcut(paste0(drive_path,"\\BIOSHIFTS Working Group.lnk"))$pathname
  # shp_path <- here::here(bioshifts_path,("DATA/BIOSHIFTS_v3/CleanDatabasev3/ShapefilesBioShiftsv3"))
  # all_shps <- list.files(shp_path,pattern = "dbf", full.names = TRUE)
  
  # From local folder
  all_shps <- list.files("C:/Users/brunn/ShadowDrive/CreateGeodatabaseBioShifts/Data/ShapefilesBioShiftsv3",
                         pattern = "shp", full.names = TRUE)
  
  biov1_meta <- pblapply(all_shps, terra::vect)
  biov1_meta <- pblapply(biov1_meta, data.frame)
  
  biov1_meta <- data.table::rbindlist(biov1_meta)
  
  write.csv(biov1_meta, here::here("data/bioshiftsv3_metadata.csv"),row.names = FALSE)
  
} else {
  
  biov1_meta <- read.csv(here::here("data/bioshiftsv3_metadata.csv"))
  
}

5.2 Load v1

biov1 <- read.csv(here::here("Data/biov1_fixednames.csv"), header = T)

# Fix references in biov1
biov1$sp_name_std_v1 <- gsub("_"," ",biov1$sp_name_std_v1)

# Select columns of interest
biov1 <- biov1 %>%
  filter(!Type %in% c("BAT","LON")) %>%
  dplyr::select(ID,Article_ID,Study_ID,
                Type,Param,Trend,SHIFT,UNIT,DUR,
                v.lat.mean,v.ele.mean,trend.mean,
                START,END,Sampling,Uncertainty_Distribution,Uncertainty_Parameter,Nperiodes,
                N,Grain_size,Data,ID.area,
                Phylum,Class,Order,Family,Genus,sp_name_std_v1,
                group,ECO,Hemisphere) %>% # select columns
  mutate(
    Type = case_when(
      Type=="HOR" ~ "LAT",
      TRUE ~ as.character(Type)),
    Data = case_when(
      Data=="occurence-based" ~ "occurrence-based",
      TRUE ~ as.character(Data)),
    spp = sp_name_std_v1,
    SHIFT_abs = abs(SHIFT),
    velocity = ifelse(Type == "LAT", v.lat.mean, v.ele.mean),
    vel_sign = ifelse(velocity>0,"pos","neg"))

# add geospatial data
biov1_meta$ID <- biov1_meta$Name

biov1 <- biov1 %>%
  filter(ID %in% biov1_meta$ID)

# merge
biov1 <- merge(biov1,biov1_meta,by="ID",all.y = TRUE) # all.y to keep only studies with geospatial data

# Get Extent (=Lat/Ele extent)
biov1$Extent <- biov1$LatExtentk
biov1$Extent[which(biov1$Type=="ELE")] <- biov1$EleExtentm[which(biov1$Type=="ELE")]

# Number of temporal units
biov1$NtempUnits <- biov1$Nperiodes
these <- which(biov1$NtempUnits > biov1$DUR)
biov1$NtempUnits[these] <- biov1$DUR[these]

# log NtempUnits
biov1$LogNtempUnits <- log(biov1$NtempUnits)

## group irregular sampling with multiple sampling  
biov1$Sampling = ifelse(biov1$Sampling %in% c("IRR","MULTIPLE(continuous)"),"MULT", biov1$Sampling)

## Grain
biov1$Grain <- factor(biov1$Grain_size, levels = c("small", "moderate", "large","very_large"))
biov1$ContinuousGrain <- 
  as.numeric(
    as.character(
      ifelse(
        biov1$Grain == "small", 1, 
        ifelse(biov1$Grain == "moderate",
               2, #10,
               ifelse(biov1$Grain == "large",
                      3, #100,
                      4))))) #we have a few very large in the full database

#harmonize column names
biov1$Quality <- ifelse(biov1$Uncertainty_Distribution %in% c("RESAMPLING"),"BALANCED",
                        ifelse(biov1$Uncertainty_Distribution %in% c("MODEL","MODEL+RESAMPLING(same)","RESAMPLING(same)+MODEL","RESAMPLING+MODEL"),"BALANCED",
                               ifelse(biov1$Uncertainty_Distribution %in% c("DETECTABILITY","RESAMPLING(same)+DETECTABILITY"),"BALANCED",
                                      ifelse(biov1$Uncertainty_Distribution %in% c("RESAMPLING(same)"),"RESURVEYED",biov1$Uncertainty_Distribution))))

table(biov1$Quality) 
## 
##   BALANCED        RAW RESURVEYED 
##      14050       9497       6522
biov1$PrAb <- ifelse(biov1$Data %in% c("occurence-based", "occurrence-based"),"occurrence","abundance")
table(biov1$PrAb) 
## 
##  abundance occurrence 
##       9544      20571
biov1 <- biov1 %>%
  dplyr::filter(!is.na(sp_name_std_v1))

all(biov1$sp_name_std_v1 %in% splist$scientificName)
## [1] TRUE
# from continuous to categorical
q1=quantile(biov1$START,probs=c(0,0.25,0.5,0.75,1))
biov1$StartF=cut(biov1$START,breaks=q1,include.lowest=T)

q1=quantile(biov1$Extent ,probs=c(0,0.25,0.5,0.75,1))
biov1$ExtentF=cut(biov1$Extent,breaks=q1,include.lowest=T)

q1=quantile(biov1$ID.area,probs=c(0,0.25,0.5,0.75,1))
biov1$AreaF=cut(biov1$ID.area,breaks=q1,include.lowest=T)

q1=quantile(biov1$N,probs=c(0,0.25,0.5,0.75,1))
biov1$NtaxaF=cut(biov1$N,breaks=q1,include.lowest=T)

# add ID to obs
biov1$obs_ID <- paste0("S",1:nrow(biov1))

summary(biov1$velocity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -5.8328  0.5415  2.0271  2.3071  2.8319 13.1259       1
summary(biov1$velocity[which(biov1$vel_sign=="pos")])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.009396  0.541507  2.055833  2.434770  2.831932 13.125885

5.3 Classify species

# Class species as Terrestrial, Marine or Freshwater
Terv1 <- unique(biov1$sp_name_std_v1[which(biov1$ECO == "T")])
Terrestrials = unique(c(Terv1))

Mar1 <- unique(biov1$sp_name_std_v1[which(biov1$ECO == "M")])
Marine = unique(c(Mar1))

# Freshwater fish
FFishv1 <- unique(biov1$sp_name_std_v1[(biov1$Class == "Actinopterygii" | biov1$Class == "Cephalaspidomorphi") & biov1$ECO == "T"])
FFish = unique(c(FFishv1))
# Marine fish
MFishv1 <- unique(biov1$sp_name_std_v1[biov1$Class == "Actinopterygii" & biov1$ECO == "M"])
MFish = unique(c(MFishv1))

splist$ECO = NA
splist$ECO[which(splist$scientificName %in% Terrestrials)] <- "T"
splist$ECO[which(splist$scientificName %in% Marine)] <- "M"
splist$ECO[which(splist$scientificName %in% MFish)] <- "M"

biov1$ECO = NA
biov1$ECO[which(biov1$sp_name_std_v1 %in% Terrestrials)] <- "T"
biov1$ECO[which(biov1$sp_name_std_v1 %in% Marine)] <- "M"
biov1$ECO[which(biov1$sp_name_std_v1 %in% MFish)] <- "M"

splist$Group = NA
splist$Group[which(splist$Class == "Phaeophyceae")] <- "Chromista"
splist$Kingdom[which(splist$Class == "Phaeophyceae")] <- "Chromista"
splist$Group[which(splist$Phylum == "Rhodophyta")] <- "Seaweed"
splist$Kingdom[which(splist$Phylum == "Rhodophyta")] <- "Plantae"
splist$Group[which(splist$Family == "Elminiidae")] <- "Barnacles"
splist$Group[which(splist$Kingdom == "Bacteria")] <- "Bacteria"
splist$Group[which(splist$Class == "Holothuroidea")] <- "Sea cucumber"
splist$Group[which(splist$Class == "Aves")] <- "Bird"
splist$Group[which(splist$Class == "Insecta")] <- "Insect"
splist$Group[which(splist$Class == "Mammalia")] <- "Mammal"
splist$Group[which(splist$Class == "Arachnida")] <- "Spider"
splist$Kingdom[which(splist$Kingdom == "Viridiplantae")] <- "Plantae"
splist$Kingdom[which(splist$Phylum == "Tracheophyta")] <- "Plantae"
splist$Group[which(splist$Kingdom == "Plantae")] <- "Plant"
splist$Group[which(splist$Class == "Hydrozoa")] <- "Hydrozoa"
splist$Group[which(splist$Class == "Anthozoa")] <- "Sea anemones and corals"
splist$Group[which(splist$Class == "Polychaeta")] <- "Polychaetes"
splist$Group[which(splist$Phylum == "Mollusca")] <- "Molluscs"
splist$Group[which(splist$Class == "Malacostraca")] <- "Crustacean"
splist$Group[which(splist$Class == "Hexanauplia")] <- "Crustacean"
splist$Group[which(splist$Class == "Maxillopoda")] <- "Crustacean"
splist$Group[which(splist$Class == "Ostracoda")] <- "Crustacean"
splist$Group[which(splist$Class == "Branchiopoda")] <- "Crustacean"
splist$Group[which(splist$Class == "Asteroidea")] <- "Starfish"
splist$Group[which(splist$Class == "Ascidiacea")] <- "Ascidians tunicates and sea squirts"
splist$Class[which(splist$Class == "Actinopteri")] <- "Actinopterygii"
splist$Group[which(splist$Class == "Actinopterygii")] <- "Fish"
splist$Group[which(splist$Class == "Elasmobranchii")] <- "Fish"
splist$Group[which(splist$Order == "Perciformes")] <- "Fish"
splist$Group[which(splist$Class == "Chondrichthyes")] <- "Fish"
splist$Group[which(splist$Class == "Holocephali")] <- "Fish"
splist$Group[which(splist$Class == "Cephalaspidomorphi")] <- "Fish"
splist$Group[which(splist$Class == "Echinoidea")] <- "Sea urchin"
splist$Group[which(splist$Class == "Crinoidea")] <- "Crinoid"
splist$Group[which(splist$Class == "Holothuroidea")] <- "Sea cucumber"
splist$Group[which(splist$Class == "Reptilia")] <- "Reptile"
splist$Group[which(splist$Order == "Squamata")] <- "Reptile"
splist$Group[which(splist$Class == "Ophiuroidea")] <- "Brittle stars"
splist$Group[which(splist$Class == "Chilopoda")] <- "Centipedes"
splist$Group[which(splist$Class == "Diplopoda")] <- "Millipedes"
splist$Group[which(splist$Class == "Amphibia")] <- "Amphibian"
splist$Group[which(splist$Kingdom == "Fungi")] <- "Fungi"
splist$Group[which(splist$Order == "Balanomorpha")] <- "Barnacles"
splist$Group[which(splist$Phylum == "Nematoda")] <- "Nematodes"
splist$Group[which(splist$Class == "Myxini")] <- "Hagfish"
splist$Group[which(splist$Kingdom == "Chromista")] <- "Chromista"
splist$Family[which(splist$Genus == "Dendrocopus")] <- "Picidae"

######################################
biov1 <- merge(biov1[,-which(names(biov1) %in% c("Phylum","Class","Order","Family","Genus","Group","ECO"))], 
               splist[,c("Kingdom","Phylum","Class","Order","Family","Genus","Group","ECO","scientificName")], 
               by.x = "sp_name_std_v1", by.y = "scientificName", 
               all.x = T)

5.4 Filter LAT / ELE shifts

Use only latitudinal and elevational shifts

# v1
biov1 <- biov1 %>%
  dplyr::filter(Type %in% c("ELE","LAT"))  # Use LAT ELE shifts

# splist
sps <- unique(biov1$sp_name)
splist <- splist %>% dplyr::filter(scientificName %in% sps)

5.5 Remove species identified to the genus level or cf.

if(any(grep("sp[.]",biov1$sp_reported_name_v1))){
  biov1 <- biov1 %>% dplyr::filter(!grepl("sp[.]",sp_reported_name_v1))
}
if(any(grep("sp[.]",biov1$sp_name_std_v1))){
  biov1 <- biov1 %>% dplyr::filter(!grepl("sp[.]",sp_name_std_v1))
}
if(any(grep("cf[.]",biov1$sp_reported_name_v1))){
  biov1 <- biov1 %>% dplyr::filter(!grepl("cf[.]",sp_reported_name_v1))
}
if(any(grep("cf[.]",biov1$sp_name_std_v1))){
  biov1 <- biov1 %>% dplyr::filter(!grepl("cf[.]",sp_name_std_v1))
}

#remove freshwater fishes
biov1 <- biov1[-which((biov1$Class == "Actinopterygii" | biov1$Group == "Fish") & biov1$ECO=="T"),]

#remove marine birds
biov1 <- biov1[-which(biov1$Class == "Aves" & biov1$ECO=="M"),]


if(any(grep("sp[.]",splist$scientificName))){
  splist <- splist %>% dplyr::filter(!grepl("sp[.]",scientificName))
}
if(any(grep("sp[.]",splist$scientificName))){
  splist <- splist %>% dplyr::filter(!grepl("sp[.]",scientificName))
}
if(any(grep("cf[.]",splist$scientificName))){
  splist <- splist %>% dplyr::filter(!grepl("cf[.]",scientificName))
}
if(any(grep("cf[.]",splist$scientificName))){
  splist <- splist %>% dplyr::filter(!grepl("cf[.]",scientificName))
}

5.6 Reclassify methodological variables

unique(biov1$Data)
## [1] "abundance-based"  "occurrence-based"
biov1$Data[biov1$Data=="occurence-based"] <- "occurrence-based"
biov1$Data <- factor(biov1$Data, levels = unique(biov1$Data))
table(biov1$Data)
## 
##  abundance-based occurrence-based 
##             9421            20393
unique(biov1$Sampling)
## [1] "TWO"        "CONTINUOUS" "MULT"
biov1$Sampling = ifelse(biov1$Sampling %in% c("IRR","MULTIPLE(continuous)"),"MULTIPLE", biov1$Sampling)
biov1$Sampling <- ordered(biov1$Sampling, 
                          levels = c("TWO","MULTIPLE","CONTINUOUS"))
table(biov1$Sampling)
## 
##        TWO   MULTIPLE CONTINUOUS 
##      25466          0       3105
unique(biov1$Grain_size)
## [1] "small"      "large"      "moderate"   "very_large" NA
biov1$Grain_size <- ifelse(biov1$Grain_size %in% c("large","very_large"),"large",biov1$Grain_size)
biov1$Grain_size <- ordered(biov1$Grain_size, 
                            levels = c("small","moderate","large"))
table(biov1$Grain_size)
## 
##    small moderate    large 
##     9740    10657     9416
unique(biov1$Uncertainty_Distribution)
## [1] "RESAMPLING(same)"               "RAW"                           
## [3] "RESAMPLING"                     "MODEL"                         
## [5] "RESAMPLING+MODEL"               "MODEL+RESAMPLING(same)"        
## [7] "RESAMPLING(same)+DETECTABILITY" "RESAMPLING(same)+MODEL"        
## [9] "DETECTABILITY"
biov1$Uncertainty_Distribution <- ifelse(biov1$Uncertainty_Distribution %in% c("RESAMPLING","RESAMPLING(same)"),"RESAMPLING",
                                         ifelse(biov1$Uncertainty_Distribution %in% c("MODEL","MODEL+RESAMPLING(same)","RESAMPLING+MODEL"),"MODEL",
                                                ifelse(biov1$Uncertainty_Distribution %in% c("DETECTABILITY","RESAMPLING(same)+DETECTABILITY"),"DETECTABILITY",
                                                       biov1$Uncertainty_Distribution)))
biov1$Uncertainty_Distribution <- ifelse(biov1$Uncertainty_Distribution == "RAW","OPPORTUNISTIC","PROCESSED")  
table(biov1$Uncertainty_Distribution)
## 
## OPPORTUNISTIC     PROCESSED 
##          9449         20365
#transform study area
biov1$ID.area <- log(biov1$ID.area)

5.7 Get centroids of study areas

This will be used to merge genetic data with bioshifts based on distance of observations

# # The input file geodatabase
# fgdb <- "C:/Users/brunn/nextCloud/bioshifts_v1_v2/v1/Study_Areas_v1/Study_Areas.gdb"
# 
# # List all feature classes in a file geodatabase
# fc_list <- ogrListLayers(fgdb)
# 
# # Get centroids
# cl <- makeCluster(detectCores()-1)
# clusterExport(cl, c("readOGR", "gCentroid", "fgdb"))
# 
# centroids <- pblapply(fc_list, function(x){
#     fc <- readOGR(dsn=fgdb,layer=x)
#     c <- data.frame(gCentroid(fc))
#     names(c) <- c("centroid.x", "centroid.y")
# 
#     geomet <- data.frame(terra::geom(terra::vect(fc)))
# 
#     max_lat <- order(geomet[,"y"], decreasing = T)[1]
#     max_lat <- geomet[max_lat,c("x","y")]
#     names(max_lat) <- c("max_lat.x", "max_lat.y")
# 
#     min_lat <- order(geomet[,"y"])[1]
#     min_lat <- data.frame(geomet[min_lat,c("x","y")])
#     names(min_lat) <- c("min_lat.x", "min_lat.y")
# 
#     cbind(c, max_lat, min_lat)
# }, cl = cl)
# 
# stopCluster(cl)
# 
# centroids <- do.call("rbind",centroids)
# centroids$ID <- fc_list
# 
# write.csv(centroids, "Data/centroids_study_areas.csv", row.names = FALSE)

centroids <- read.csv(here::here("Data/centroids_study_areas.csv"))

biov1 <- merge(biov1, centroids, by = "ID")

6 Direction of shift

Identify direction of shift

biov1 <- biov1 %>%
  mutate(shift_sign = ifelse(SHIFT>0,"pos","neg"),
         shift_vel_sign = paste0(shift_sign,vel_sign)) %>%
  filter(!is.na(velocity))

ggplot(biov1, aes(x=shift_vel_sign))+
  geom_bar()+
  coord_flip()+
  facet_wrap(Type~Param)

summary(biov1$velocity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5.8328  0.5415  2.0271  2.2860  2.7543 13.1259
summary(biov1$velocity[which(biov1$shift_vel_sign=="pospos")])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.009396  0.541507  2.073300  2.470806  2.831932 13.125880
summary(biov1$velocity[which(biov1$shift_vel_sign=="negneg")])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.79347 -2.30487 -1.78004 -1.59872 -0.58152 -0.02945

7 Calculate lags raw

# calculate lags
# positive values are a lag (range shift lower smaller than expected or in opposite directions) and negative values are range shift larger than expected
biov1$lag <- biov1$velocity-biov1$SHIFT
position = which(biov1$vel_sign == "neg")
biov1$lag[position] <- biov1$lag[position] * -1 # Any negative velocity means the sign of lag has to shift.

# Lag 2
# When shift is in opposite sign of velocity, lag = abs(velocity)
biov1$lag2 = biov1$lag
position <- which(biov1$shift_vel_sign == "posneg" | biov1$shift_vel_sign == "negpos")
biov1$lag2[position] <- abs(biov1$velocity[position])

# Lag 3
# When shift is > velocity, lag = 0
biov1$lag3 = biov1$lag2
position <- which((biov1$shift_sign == "pos" & biov1$SHIFT > biov1$velocity) |
                    (biov1$shift_sign == "neg" & biov1$SHIFT < biov1$velocity))
biov1$lag3[position] <- 0


{
  par(mfrow=c(2,2))
  plot(lag~velocity, biov1)
  plot(lag2~velocity, biov1)
  plot(lag3~velocity, biov1)
}

ggplot(biov1, aes(x = Param, y=lag, fill = Param, color = Param))+
  geom_boxplot(alpha = .5, outlier.shape = NA)+
  scale_y_continuous(limits = quantile(biov1$lag2, c(0.1, 0.9), na.rm = T))+
  facet_wrap(.~Type, scales = "free")
## Warning: Removed 8756 rows containing non-finite values (`stat_boxplot()`).
ggplot(biov1, aes(x = Param, y=lag2, fill = Param, color = Param))+
  geom_boxplot(alpha = .5, outlier.shape = NA)+
  scale_y_continuous(limits = quantile(biov1$lag2, c(0.1, 0.9), na.rm = T))+
  facet_wrap(.~Type, scales = "free")
## Warning: Removed 5850 rows containing non-finite values (`stat_boxplot()`).
ggplot(biov1, aes(x = Param, y=log1p(lag3), fill = Param, color = Param))+
  geom_boxplot(alpha = .5, outlier.shape = NA)+
  # scale_y_continuous(limits = quantile(log1p(biov1$lag3), c(0.1, 0.9), na.rm = T))+
  facet_wrap(.~Type, scales = "free")

8 All shifts vs same direction

ggplot(biov1, aes(x = velocity, y = SHIFT))+
  geom_point(aes(color = lag2), alpha = .1)+
  scale_color_viridis_c()+
  geom_smooth(method = "lm")+
  theme_classic()+
  facet_wrap(Param~Type+ECO)
## `geom_smooth()` using formula = 'y ~ x'

ggplot(biov1 %>%
         filter(shift_vel_sign == "pospos" | shift_vel_sign == "negneg"), 
       aes(x = abs(velocity), y = abs(SHIFT)))+
  geom_point(aes(color = lag2), alpha = .1)+
  scale_color_viridis_c()+
  geom_smooth(method = "lm")+
  theme_classic()+
  facet_wrap(Param~Type+ECO)
## `geom_smooth()` using formula = 'y ~ x'

9 Load genetic data

  • mt and ms are unpublished Malin’s data: These are mitocondrial (mt) or microsatelite (ms) data.

  • gen_d is from De Kort et al 2021 Citation: De Kort et al. 2021. Life history, climate and biogeography interactively affect worldwide genetic diversity of plant and animal populations. Nat Commun. This is population-level data for plants and animals. Genetic diversity was estimated from expected heterozygosity. They AFLP and microsatelite data, applying a harmonization approach for comparing both data.

  • gen_lf is from Lawrence et al. 2019 Citation: Lawrence, Elizabeth R; N. Benavente, Javiera; Matte, Jean-Michel; Marin, Kia; Wells, Zachery; Bernos, Thaïs A.; et al. (2019). MacroPopGen Database: Geo-referenced population-specific microsatellite data across the American continents. figshare. Dataset. This is population-level data using microsatellite data across the Americas.

mt <- fread(here::here('Data/Old/mtdna.csv'))
mt$spp <- Clean_Names(mt$spp)

ms <- fread(here::here('Data/Old/msat.csv'))
ms$spp <- Clean_Names(ms$spp)

gen_d <- fread(here::here('Data/Old/Deposited_data_genetic_diversity_dekort2021.csv'))
gen_d$spp <-  gsub("_"," ",gen_d$Species)
gen_d$spp <- Clean_Names(gen_d$spp)

gen_lf <- read_excel(here::here("Data/Old/MacroPopGen_Database_final-v0.2.xlsx"), sheet = "MacroPopGen_Database")
gen_lf$spp <-  gsub("_"," ",gen_lf$G_s)
gen_lf$spp <- Clean_Names(gen_lf$spp)

# De Kort
length(unique(gen_d$spp)) # 714

# Lawrence et al.
length(unique(gen_lf$spp)) # 895

# Malin mithocondrial
length(unique(mt$spp)) # 162

# Malin microsatelites
length(unique(ms$spp)) # 275

# N species with gen data
gen_sps <- unique(c(ms$spp,mt$spp,gen_lf$spp,gen_d$spp))
length(gen_sps) # 1893

9.1 Fix names at the genetic data

# Species to find
mycols <- c("species","scientificName","kingdom","phylum","class","order","family","db_code")
splist <- data.frame(matrix(ncol = length(mycols), nrow = length(unique(gen_sps))))
names(splist) <- mycols

splist$reported_name_fixed <- unique(gen_sps)
tofind_ <- splist[which(is.na(splist$scientificName)),]
tofind_ <- unique(tofind_$reported_name_fixed)

tofind <- data.frame(matrix(nrow = length(tofind_), ncol = 8))
names(tofind) = c("scientificName", "kingdom", "phylum", "class", "order", "family", "db", "db_code")

tofind <- data.frame(species = tofind_, tofind)

tofind <- tofind %>%
    mutate(across(everything(), as.character))

# retrieve sp names
sp_names_found <- Find_Sci_Names(spnames = tofind$species)

# Feed
tofind <- tofind %>% 
    rows_patch(sp_names_found, 
               by = "species")

gc()

## Add found species names to the splist
all(tofind$species %in% splist$reported_name_fixed)

if(any(is.na(tofind$scientificName))){
    found <- tofind[-which(is.na(tofind$scientificName)),]
} else {
    found = tofind
}

for(i in 1:length(found$species)){
    tofill <- unique(which(splist$reported_name_fixed == found$species[i]))
    splist$scientificName[tofill] <- found$scientificName[i]
    splist$kingdom[tofill] <- found$kingdom[i]
    splist$phylum[tofill] <-found$phylum[i]
    splist$class[tofill] <- found$class[i]
    splist$order[tofill] <- found$order[i]
    splist$family[tofill] <- found$family[i]
    splist$db[tofill] <- found$db[i]
    splist$db_code[tofill] <- found$db_code[i]
}

splist$spp = splist$reported_name_fixed

# Keep original names for those species we could not find a name at GBIF
pos <- which(is.na(splist$scientificName))
splist$scientificName[pos] <- splist$spp[pos]

any(!mt$spp %in% gen_sps)
any(!mt$spp %in% splist$spp)

any(!ms$spp %in% gen_sps)
any(!ms$spp %in% splist$spp)

any(!gen_d$spp %in% gen_sps)
any(!gen_d$spp %in% splist$spp)

any(!gen_lf$spp %in% gen_sps)
any(!gen_lf$spp %in% splist$spp)

for(i in 1:nrow(mt)){
    mt$spp_new[i] <- splist$scientificName[which(splist$spp == mt$spp[i])]
}
for(i in 1:nrow(ms)){
    ms$spp_new[i] <- splist$scientificName[which(splist$spp == ms$spp[i])]
}
for(i in 1:nrow(gen_d)){
    gen_d$spp_new[i] <- splist$scientificName[which(splist$spp == gen_d$spp[i])]
}
for(i in 1:nrow(gen_lf)){
    gen_lf$spp_new[i] <- splist$scientificName[which(splist$spp == gen_lf$spp[i])]
}

write.csv(mt, 
          here::here('Data/mtdna_harmo.csv'), row.names = FALSE)
write.csv(ms, 
          here::here('Data/msat_harmo.csv'), row.names = FALSE)
write.csv(gen_d, 
          here::here('Data/Deposited_data_genetic_diversity_dekort2021_harmo.csv'), row.names = FALSE)
write.csv(gen_lf, 
          here::here('Data/MacroPopGen_Database_final_areas_Lawrence_Fraser_2020_harmo.csv'), row.names = FALSE)

10 Load harmonized genetic data

mt <- fread(here::here('Data/Old/mtdna_harmo.csv'))
ms <- fread(here::here('Data/Old/msat_harmo.csv'))
gen_d <- fread(here::here('Data/Old/Deposited_data_genetic_diversity_dekort2021_harmo.csv'))
gen_d$spp <-  gsub("_"," ",gen_d$Species)
gen_lf <- fread(here::here('Data/Old/MacroPopGen_Database_final_areas_Lawrence_Fraser_2020_harmo.csv'))
gen_lf$spp <-  gsub("_"," ",gen_lf$G_s)

10.1 Stats

# update new names
mt$spp <- mt$spp_new
ms$spp <- ms$spp_new
gen_d$spp <- gen_d$spp_new
gen_lf$spp <- gen_lf$spp_new

# N species with Gen data in bioshifts
gen_sps <- unique(c(mt$spp, ms$spp, gen_d$spp, gen_lf$spp))

length(which(gen_sps %in% unique(biov1$spp))) # 318
## [1] 318
# Malin's mt in bioshifts
length(which(unique(mt$spp) %in% unique(biov1$spp))) # 32
## [1] 32
# Malin's ms in bioshifts
length(which(unique(ms$spp) %in% unique(biov1$spp))) # 53
## [1] 53
# De Kort et al. in bioshifts
length(which(unique(gen_d$spp) %in% unique(biov1$spp))) # 148
## [1] 148
# Lawrance et al. ms in bioshifts
length(which(unique(gen_lf$spp) %in% unique(biov1$spp))) # 132
## [1] 132

10.2 Filter species in Bioshifts

gen_sps <- gen_sps[which(gen_sps %in% unique(biov1$spp))]

mt <- mt %>% filter(spp %in% gen_sps)
ms <- ms %>% filter(spp %in% gen_sps)
gen_d <- gen_d %>% filter(spp %in% gen_sps)
gen_lf <- gen_lf %>% filter(spp %in% gen_sps)

10.3 Explore distribution

# Malin
ggplot(mt, aes(He)) +
    geom_histogram() +
    ggtitle("Malin's Mitochondrial")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite values (`stat_bin()`).

ggplot(ms, aes(He)) +
    geom_histogram() +
    ggtitle("Malin's Microsatellite")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# De Kort
ggplot(gen_d, aes(GDp)) +
    geom_histogram() +
    ggtitle("De Kort AFLP & Microsatellite")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Why De Kort data has the two peaks? It is due to the marker type
ggplot(gen_d, aes(GDp)) +
    geom_histogram()+
    facet_grid(.~Marker)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# But if we look at the normalized GDp, the problem is solved. Therefore, we should be using the normalized version of GDp from DeKort
ggplot(gen_d, aes(GDp_norm)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Lawrence & Fraser
ggplot(gen_lf, aes(as.numeric(He))) +
    geom_histogram()+
    ggtitle("Lawrence & Fraser Microsatellite")
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 309 rows containing non-finite values (`stat_bin()`).

11 Create a single genetic diversity dataset

# Malin
mt_sub <- mt  %>%
    mutate(long = as.numeric(lon),
           lat = as.numeric(lat),
           N = as.numeric(n), # Sample size
           Marker = "Mitochondrial")  %>%
    dplyr::filter(!is.na(He) | !He==0) %>%
    dplyr::select(spp, He, long, lat, Marker)
mt_sub$Source = "Malin"
mt_sub$Het_type = "He"

ms_sub <- ms %>%
    mutate(long = as.numeric(lon),
           lat = as.numeric(lat),
           N = as.numeric(n), # Sample size
           Marker = "Microsatellite")  %>%
    dplyr::filter(!is.na(He) | !He==0) %>%
    dplyr::select(spp, He, long, lat, Marker)
ms_sub$Source = "Malin"
ms_sub$Het_type = "He"

###################
# De Kort

# Jonathan R.:
# co-dominant refers to the fact that you can distinguish homozigous and heterozygous individuals, such as microsatellites
# dominant means that you either have the detection of the marker or not. but you don’t know if the individual is heterozygous or not, such as AFLP
# restriction enzymes are proteins that you use to cut the genomes into fragments. there are used for AFLP and allozymes, so not sure

# De Kort code for Marker:
# CD = Co-dominant
# D = Dominant
# ENZ = Enzime

gen_d_sub <- gen_d %>%
    mutate(long = as.numeric(LONGITUDE),
           lat = as.numeric(LATITUDE),
           spp = spp, 
           He = as.numeric(GDp),
           N = as.numeric(SampleSize), # Sample size
           Marker = case_when(
               Marker=="CD" ~ "Microsatellite",
               Marker=="D" ~ "AFLP",
               Marker=="ENZ" ~ "AFLP",
               TRUE ~ as.character(Marker))) %>%
    dplyr::filter(!is.na(He) | !He==0) %>%
    dplyr::filter(!(Marker == "AFLP" & He > .5)) %>% # AFLPs > 0.5 are errors
    dplyr::select(spp, He, long, lat, Marker)
gen_d_sub$Source = "De Kort"
gen_d_sub$Het_type = "He"

# How DeKort calculated the normalized version of GDp
# gen_d_sub$He_harm <- NA
# 
# m <- unique(gen_d_sub$Marker)
# for(i in 1:length(m)){
# pos <- which(gen_d_sub$Marker == m[i])
# gen_d_sub$He_harm[pos] <-  scale(gen_d_sub$He[pos])
# }
# gen_d_sub$He_harm <-  (gen_d_sub$He_harm - min(gen_d_sub$He_harm))/(max(gen_d_sub$He_harm)-min(gen_d_sub$He_harm))

###################
# Lawrence & Fraser
pos <- which(is.na(gen_lf$He))
gen_lf$He_new <- gen_lf$He
gen_lf$He_new[pos] <- gen_lf$Ho[pos]
gen_lf$Het_type = "He"
gen_lf$Het_type[pos] = "Ho"

gen_lf_sub <- gen_lf %>%
    mutate(long = as.numeric(Long),
           lat = as.numeric(Lat),
           He = as.numeric(He_new),
           spp = spp,
           N = as.numeric(n), # Sample size
           Marker = "Microsatellite") %>% 
    dplyr::filter(!is.na(He) | !He==0) %>%
    dplyr::select(spp, He, long, lat, Marker, Het_type) 
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `He = as.numeric(He_new)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
gen_lf_sub$Source = "Lawrence & Fraser"

###################
# Group all
gen_div <- rbind(mt_sub %>% dplyr::select(names(mt_sub)), 
                 ms_sub %>% dplyr::select(names(mt_sub)),
                 gen_d_sub %>% dplyr::select(names(mt_sub)),
                 gen_lf_sub %>% dplyr::select(names(mt_sub)))
gen_div <- na.omit(gen_div)

gen_div$spp <- as.factor(gen_div$spp)
gen_div$Marker <- as.factor(gen_div$Marker)
gen_div$Source <- as.factor(gen_div$Source)

length(unique(gen_div$spp))
## [1] 310

12 Harmonize genetic data

  • He_harm = transformation used in DeKort et al 2021 (Nature Communications). Makes markers range from 0-1.
  1. scale values at each marker type
  2. standardize (x - min)/(max-min)
  • He_harm2 = transformation suggested by Malin. Applies the function transform01 before a logit transformation.
  1. transform01 >> The only intent of this function is to move slightly inward the values of He so that it allows applying logit transformation.
  2. apply logit transformation
  • He_harm3 = He_harm2 centered in zero.
gen_div$He_harm <- NA
gen_div$He_harm2 <- NA
gen_div$He_harm3 <- NA

m <- unique(gen_div$Marker)
for(i in 1:length(m)){
    # Select marker type
    pos <- which(gen_div$Marker == m[i])
    # Apply DeKort transformation
    gen_div$He_harm[pos] <- deKort_trans(gen_div$He[pos])
    # Apply logit transformation
    gen_div$He_harm2[pos] <- logit_trans(gen_div$He[pos])
    # Centered in zero
    gen_div$He_harm3[pos] <- scale(gen_div$He_harm2[pos])
}

length(unique(gen_div$spp))
## [1] 310
# remove outliers
gen_metrics <- c("He_harm","He_harm2","He_harm3")

for (i in 1:length(gen_metrics)){
    my_gem <- as.numeric(data.frame(gen_div %>% select(gen_metrics[i]))[,1])
    
    quartiles <- quantile(my_gem, probs=c(.05, .95), na.rm = FALSE)
    IQR <- IQR(my_gem)
    
    Lower <- quartiles[1] - 1.5*IQR
    Upper <- quartiles[2] + 1.5*IQR 
    
    gen_div <- subset(gen_div, my_gem > Lower & my_gem < Upper)
}

length(unique(gen_div$spp))
## [1] 309

12.1 Avg by location

We averaged genetic diversity by location (same species, in the same XY coordinates)

# How many species with >1 gen div measurements at the same site?
# Check how many obs per site and marker type exist
N_obs <- gen_div %>%
    group_by(spp, long, lat) %>%
    tally() %>%
    dplyr::filter(n>1)

DT::datatable(N_obs)
# Is there any species with >1 marker type in the same location?
gen_div %>%
    group_by(spp, long, lat, Marker) %>%
    dplyr::summarise(N = length(unique(spp))) %>%
    dplyr::filter(N>1)
## `summarise()` has grouped output by 'spp', 'long', 'lat'. You can override
## using the `.groups` argument.
## # A tibble: 0 × 5
## # Groups:   spp, long, lat [0]
## # ℹ 5 variables: spp <fct>, long <dbl>, lat <dbl>, Marker <fct>, N <int>
# No, there are not

# Is there any species with >1 Source type in the same location?
gen_div %>%
    group_by(spp, long, lat, Source) %>%
    dplyr::summarise(N = length(unique(spp))) %>%
    dplyr::filter(N>1)
## `summarise()` has grouped output by 'spp', 'long', 'lat'. You can override
## using the `.groups` argument.
## # A tibble: 0 × 5
## # Groups:   spp, long, lat [0]
## # ℹ 5 variables: spp <fct>, long <dbl>, lat <dbl>, Source <fct>, N <int>
# No, there are not

# Avg by site and species
gen_div <- gen_div %>%
    group_by(spp, long, lat) %>%
    dplyr::summarise(He = median(He),
                     He_harm = median(He_harm),
                     He_harm2 = median(He_harm2),
                     He_harm3 = median(He_harm3),
                     Source = paste(Source, sep = ", "),
                     Marker = paste(Marker, sep = ", "))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'spp', 'long', 'lat'. You can override
## using the `.groups` argument.
unique(gen_div$Source)
## [1] "De Kort"           "Lawrence & Fraser" "Malin"
unique(gen_div$Marker)
## [1] "AFLP"           "Microsatellite" "Mitochondrial"
length(unique(gen_div$spp))
## [1] 309

12.2 Which species?

gen_div <- merge(gen_div, 
                 splist[,-7],
                 by.x="spp",
                 by.y="scientificName",
                 all.x = T)

toplot_ <- gen_div %>%
    group_by(Group) %>%
    dplyr::summarise(Obs = length(spp),
                     Species = length(unique(spp))) %>%
    gather(var, N, -c(Group))

ggplot(toplot_, aes(x = Group, y = N))+
    ggtitle("N species by Group")+
    geom_bar(stat="identity")+
    geom_text(aes(y=N, label=N, 
                  hjust = ifelse(N<max(N),-.1,1.1)), vjust=0.2, size=3, 
              position = position_dodge(0.9))+
    theme_classic()+
    coord_flip()+
    facet_wrap(.~var, scales = "free", ncol=1)

12.3 Distribution of He

# correlations
ggpairs(gen_div,
        columns = c("He","He_harm","He_harm2","He_harm3"))

ggpairs(gen_div,
        columns = c("He","He_harm","He_harm2","He_harm3"),      
        aes(color = Marker,  
            alpha = 0.5))     

ggpairs(gen_div,
        columns = c("He","He_harm","He_harm2","He_harm3"),      
        aes(color = Source,  
            alpha = 0.5))

to_plot <- gen_div %>%
    select(Group,He_harm,He_harm2,He_harm3) %>%
    filter(Group %in% c("Mammal","Fish","Plant", "Bird")) %>%
    gather(metric, value, -Group)

ggplot(to_plot, aes(value, fill = Group, color = Group))+
    geom_density(alpha = .3)+
    scale_color_viridis_d()+
    scale_fill_viridis_d()+
    facet_wrap(.~metric, scales = "free")

12.4 High He with latitude?

12.4.1 Absolution latitude

toplot <- gen_div %>%
    mutate(Hemisphere = ifelse(lat<0, "South", "North"),
           lat_abs = abs(lat),
           He_harm = scale(He_harm),
           He_harm3 = scale(He_harm3))

ggplot(toplot, aes(x = lat_abs, y = He_harm, color = Group))+
    geom_point(alpha = .1)+
    stat_smooth(method = "lm")+
    facet_wrap(.~Group)
## `geom_smooth()` using formula = 'y ~ x'

mod1 <- lm(He_harm~lat*Group, toplot)
emtrends(mod1, ~ lat*Group, var = "lat")
##   lat Group     lat.trend       SE   df lower.CL upper.CL
##  47.3 Amphibian  -0.06976 0.006401 8301 -0.08230 -0.05721
##  47.3 Bird        0.00373 0.001371 8301  0.00104  0.00642
##  47.3 Fish        0.01043 0.000788 8301  0.00889  0.01198
##  47.3 Mammal     -0.01131 0.003332 8301 -0.01784 -0.00478
##  47.3 Molluscs    0.06839 0.232621 8301 -0.38761  0.52438
##  47.3 Plant      -0.01923 0.001149 8301 -0.02148 -0.01697
##  47.3 Reptile     0.07798 0.023030 8301  0.03284  0.12313
## 
## Confidence level used: 0.95

12.5 Spatial pattern

mundi <- rnaturalearth::ne_coastline(scale = 110, returnclass = "sv")
mundi <- crop(mundi, ext(c(-180,180,-60,90)))

# get vect gen div
vect_div <- terra::vect(gen_div %>%
                          dplyr::filter(spp %in% biov1$sp_name_std_v1),
                        geom=c("long","lat"),crs=crs(mundi))

# get vect centroids bioshifts
vect_biov1 <- terra::vect(biov1 %>%
                            filter(spp %in% gen_div$spp),
                          geom=c("centroid.x","centroid.y"),crs=crs(mundi))

# get raster bioshifts shp files study areas
get_raster_bioshifts = "NO"
if(get_raster_bioshifts=="YES"){
  my_ext = terra::ext(mundi)
  my_crs = crs(mundi)
  rast_biov1 <- terra::rast(my_ext, crs = my_crs, res = 0.5)
  values(rast_biov1) <- 0
  
  fgdb <- "C:/Users/brunn/ShadowDrive/CreateGeodatabaseBioShifts/Data/Study_Areas_v1/Study_Areas.gdb"
  fc_list <- terra::vector_layers(fgdb)
  fc_list <- fc_list[which(fc_list %in% unique(as.data.frame(vect_biov1)$ID))]
  
  for(i in 1:length(fc_list)){ cat("\r",i,"from",length(fc_list))
    tmp = terra::vect(fgdb, layer = fc_list[i])
    tmp = terra::cells(rast_biov1,tmp)
    tmp_cell = tmp[,2]
    tmp_vals = rast_biov1[tmp_cell][,1]
    rast_biov1[tmp_cell] = tmp_vals+1
  }
  names(rast_biov1) <- "SA"
  rast_biov1[rast_biov1==0] <- NA
  
  writeRaster(rast_biov1, here::here("Data/raster_bioshifts_SA.tif"), overwrite = TRUE)
  
} else {
  rast_biov1 <- terra::rast(here::here("Data/raster_bioshifts_SA.tif"))
  rast_biov1 <- crop(rast_biov1, ext(mundi))
}

values_biov1 <- na.omit(values(rast_biov1))

plot_data <- gen_div %>%
  dplyr::filter(spp %in% biov1$sp_name_std_v1) %>%
  mutate(x = round(long,3),
         y = round(lat,3))

gen_fig <- ggplot()+
  geom_spatvector(data=mundi)+
  geom_hex(data=plot_data,
           binwidth = c(4, 4),
           aes(x=x, y=y))+
  scale_fill_continuous(trans="log", 
                        alpha=.7,
                        labels=function(x) round(x,0),
                        type = "viridis", name = "Species with\ngenetic data")+
  theme_void()

gen_fig +
  ggtitle("Genetic diversity data")

bio_fig <- ggplot() +
  geom_spatraster(data=rast_biov1)+
  scale_fill_whitebox_c(
    name = "BioShifts\nstudy areas",
    palette = "viridi",
    alpha=.7,
    na.value = "white",
    trans="log", 
    labels=function(x) round(x,0),
    breaks = seq(min(values_biov1), max(values_biov1), length.out = 5))+
  geom_spatvector(data=mundi)+
  theme_void()

bio_fig +
  ggtitle("Bioshifts study areas")

bio_gen_fig <- bio_fig +
  geom_spatvector(data=vect_div, alpha = .3)

bio_gen_fig +
  ggtitle("Genetic diversity data & BioShifts study areas")

ggsave(filename = here::here("Figs/Map_pop-level_GD.pdf"), 
       plot = bio_gen_fig, 
       width = 12,
       height = 5,
       device = "pdf", 
       units = "in")
bio_gen_fig2 <- ggpubr::ggarrange(gen_fig,
                                  bio_fig,
                                  labels = c("A", "B"),
                                  ncol=1)

bio_gen_fig2

ggsave(filename = here::here("Figs/Map_pop-level_GD2.pdf"), 
       plot = bio_gen_fig2, 
       width = 10,
       height = 8,
       device = "pdf", 
       units = "in")

13 Merge genetic and bioshifts data

We tried several methods for merging genetic data and bioshifts. We decided that Method 5 is the best one.

14 Method 1 - Normal merge

This is not the correct way to merge because it creates multiple false duplicates.

15 Method 2 - Median He per species

This ignores spatial proximity of He values to range shift values.

16 Method 3 - Median He per Param & species

Use lower latitude TE and higher latitude as LE (North Hemisphere).
Although this collects He values based on latitude, the proximity of He to the study areas where range shifts were observed is ignored.

17 Method 4 - Based on distance

Calculate distance weighted mean He for each species in the bioshifts database.
This method collects He values more close to the centroid of the study area.

18 Method 5 - Based on location

Get the closest He to the centroid (for O), to the max latitude (for LE, at the North hemisphere), and to the min latitude (for TE, at the North hemisphere) of the study area.

# sp list
m <- unique(biov1$spp[which(biov1$spp %in% gen_div$spp)])

gen_data_v1_dist2 <- data.frame()

# sp2go="Alces alces"
# sp2go="Sylvia atricapilla"
for(i in 1:length(m)){ cat(i, "from", length(m), "\r")
    sp2go <- m[i]
    
    # subset genetic data
    sub_gen <- subset(gen_div[,1:9], spp == sp2go)
    sub_gen_v <- terra::vect(sub_gen, 
                             geom=c("long", "lat"), 
                             crs = "+proj=longlat +datum=WGS84 +no_defs")
    
    # subset bioshifts data
    sub_bio <- subset(biov1, spp == sp2go)
    
    # study areas for species i
    areas <- unique(sub_bio$ID)
    
    gen_data_v1_dist2_tmp <- data.frame()
    
    # get genetic data for species i in each study area
    for(j in 1:nrow(sub_bio)){
        
        # bioshifts in study area j and range pos r
        sub_bio_area_j_tmp <- sub_bio[j,]
        
        range_pos2go <- sub_bio_area_j_tmp$Param
        
        coord_names <- NA
        coord_names <- if(range_pos2go == "O"){ c("centroid.x", "centroid.y") } else {coord_names}
        coord_names <- if(range_pos2go == "LE" & sub_bio_area_j_tmp$centroid.y > 0){ c("max_lat.x", "max_lat.y") } else {coord_names}
        coord_names <- if(range_pos2go == "LE" & sub_bio_area_j_tmp$centroid.y < 0){ c("min_lat.x", "min_lat.y") } else {coord_names}
        coord_names <- if(range_pos2go == "TE" & sub_bio_area_j_tmp$centroid.y < 0){ c("max_lat.x", "max_lat.y") } else {coord_names}
        coord_names <- if(range_pos2go == "TE" & sub_bio_area_j_tmp$centroid.y > 0){ c("min_lat.x", "min_lat.y") } else {coord_names}
        
        sub_bio_area_j_v = data.frame(sub_bio_area_j_tmp)[,coord_names]
        names(sub_bio_area_j_v) <- c("long", "lat")
        
        sub_bio_area_j_v <- terra::vect(unique(sub_bio_area_j_v[,c("long", "lat")]), 
                                        geom=c("long", "lat"), 
                                        crs = "+proj=longlat +datum=WGS84 +no_defs")
        
        # plot(terra::vect(rbind(sub_gen[,c("long", "lat")], sub_bio_area_j_tmp[,c("long", "lat")]),
        #  geom=c("long", "lat"),
        #  crs = "+proj=longlat +datum=WGS84 +no_defs"))
        # plot(sub_bio_area_j_v, col = "red", add=T)
        
        dists <- terra::distance(sub_bio_area_j_v, sub_gen_v)[1,]
        dists_order <- order(dists)[1]
        min_dist <- dists[dists_order] # distance to closest
        
        # the closest
        sub_gen_tmp <- sub_gen[dists_order,]
        sub_gen_tmp$min_dist <- min_dist
        
        # weighted distance
        sub_gen_tmp$He_harm_w <- weighted.mean(sub_gen$He_harm, 1/(dists^2)) # inverse of the distance
        sub_gen_tmp$He_harm2_w <- weighted.mean(sub_gen$He_harm2, 1/(dists^2)) # inverse of the distance
        sub_gen_tmp$He_harm3_w <- weighted.mean(sub_gen$He_harm3, 1/(dists^2)) # inverse of the distance
        
        gen_data_v1_dist2_tmp <- rbind(gen_data_v1_dist2_tmp,
                                       append(list(sub_gen_tmp), 
                                              list(sub_bio_area_j_tmp)) %>%
                                           reduce(full_join, by="spp"))
        
    }
    
    gen_data_v1_dist2 <- rbind(gen_data_v1_dist2,
                               gen_data_v1_dist2_tmp)
    
}

gen_data_v1_dist2 <- gen_data_v1_dist2 %>% dplyr::filter(!is.na(He), !is.na(SHIFT))

19 Save data

write.csv(gen_data_v1_dist2,here::here("Data/gen_data_final_distance_pop-level.csv"),row.names = FALSE)

20 Explore data

20.1 The closest He vs distance weighted He

plot(gen_data_v1_dist2$He_harm,gen_data_v1_dist2$He_harm_w)

plot(gen_data_v1_dist2$He_harm2,gen_data_v1_dist2$He_harm2_w)

plot(gen_data_v1_dist2$He_harm3,gen_data_v1_dist2$He_harm3_w)

20.2 Compare shift values of the full bioshifts vs the subset (genetic) dataset

toplot <- rbind(
    data.frame(dataset = "Full", 
               lags = biov1$lag2, 
               shifts = biov1$SHIFT),
    data.frame(dataset = "Subset", 
               lags = gen_data_v1_dist2$lag2, 
               shifts = gen_data_v1_dist2$SHIFT))

# shift
ggplot(toplot, aes(x = dataset, y=shifts, fill = dataset, color = dataset))+
    geom_boxplot(alpha = .5, outlier.shape = NA)+
    scale_y_continuous(limits = quantile(toplot$shifts, c(0.1, 0.9), na.rm = T))
## Warning: Removed 6336 rows containing non-finite values (`stat_boxplot()`).

tapply(toplot$shifts, toplot$dataset, summary)
## $Full
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -119.1696   -0.4468    0.3056    1.1671    2.4345  146.3000 
## 
## $Subset
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -119.1696   -0.3768    0.1444    1.0269    1.9134   39.3400
mod1 <- aov(shifts~dataset, data = toplot)
summary(mod1)
##                Df Sum Sq Mean Sq F value Pr(>F)
## dataset         1     34   34.45   1.132  0.287
## Residuals   31673 963794   30.43

There are difference in mean values of lags, but no difference in mean shift values, between the full and subset datasets.

20.3 SHIFT

20.3.1 General pattern

20.3.1.1 He_harm

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
    ggplot(aes(x = He_harm, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")

20.3.1.2 He_harm2

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
    ggplot(aes(x = He_harm2, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")

20.3.1.3 He_harm3

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm3, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")

20.3.2 By Param

20.3.2.1 He_harm

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Param, scales = "free", ncol = 3)

20.3.2.2 He_harm2

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm2, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Param, scales = "free", ncol = 3)

20.3.2.3 He_harm3

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm3, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Param, scales = "free", ncol = 3)
## Warning in stat_smooth(geom = "line", method = "lm", alpha = 0.3, se = FALSE, :
## Ignoring unknown aesthetics: by
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

20.3.3 By Group

20.3.3.1 He_harm

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Group, scales = "free", ncol = 3)

20.3.3.2 He_harm2

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm2, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Group, scales = "free", ncol = 3)

20.3.3.3 He_harm3

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm3, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(.~Group, scales = "free", ncol = 3)
## Warning in stat_smooth(geom = "line", method = "lm", alpha = 0.3, se = FALSE, :
## Ignoring unknown aesthetics: by
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

20.3.4 By Group * Param

20.3.4.1 He_harm

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(Group~Param, scales = "free")

20.3.4.2 He_harm2

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm2, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(Group~Param, scales = "free")

20.3.4.3 He_harm3

gen_data_v1_dist2 %>%
    dplyr::filter(Type == "LAT",
                  shift_vel_sign == "pospos") %>%
  ggplot(aes(x = He_harm3, y = (abs(SHIFT)), color = abs(velocity)))+
    ggtitle("Latitude")+
    xlab("Genetic diversity (He harm logit)")+
    ylab("Range shift (km/year)") +
    scale_color_viridis_c(direction = -1)+
    geom_point(alpha = .5, size = 3)+
    theme_bw()+
    geom_smooth(method = "lm", color = "black", size=1)+
    stat_smooth(geom="line", method = "lm", alpha=0.3, se = FALSE, 
                aes(by=as.factor(Article_ID)), color = "black")+
    facet_wrap(Group~Param, scales = "free")
## Warning in stat_smooth(geom = "line", method = "lm", alpha = 0.3, se = FALSE, :
## Ignoring unknown aesthetics: by
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'